import pandas as pd

# Load the CT slice-localization dataset and preview the first rows.
# Each row holds 384 CT-scan feature values plus a patient id and the
# 'reference' target (relative slice location).
file_path = '/content/slice_localization_data.csv'
df = pd.read_csv(file_path)
print(df.head())
patientId value0 value1 value2 value3 value4 value5 value6 value7 \ 0 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 1 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 2 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 3 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 4 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 value8 ... value375 value376 value377 value378 value379 value380 \ 0 -0.25 ... -0.25 0.980381 0.0 0.0 0.0 0.0 1 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 2 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 3 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 4 -0.25 ... -0.25 0.976833 0.0 0.0 0.0 0.0 value381 value382 value383 reference 0 0.0 -0.25 -0.25 21.803851 1 0.0 -0.25 -0.25 21.745726 2 0.0 -0.25 -0.25 21.687600 3 0.0 -0.25 -0.25 21.629474 4 0.0 -0.25 -0.25 21.571348 [5 rows x 386 columns]
# Capture and report the (rows, columns) dimensions of the dataset.
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
Dataset shape: (53500, 386)
# Partition the columns by dtype: object -> categorical,
# int64/float64 -> numerical.
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

print("Categorical columns:", categorical_columns)
print("Numerical columns:", numerical_columns)

# Report how many columns fell into each group.
categorical_count = categorical_columns.size
numerical_count = numerical_columns.size
print("Number of categorical columns:", categorical_count)
print("Number of numerical columns:", numerical_count)
Categorical columns: Index([], dtype='object')
Numerical columns: Index(['patientId', 'value0', 'value1', 'value2', 'value3', 'value4', 'value5',
'value6', 'value7', 'value8',
...
'value375', 'value376', 'value377', 'value378', 'value379', 'value380',
'value381', 'value382', 'value383', 'reference'],
dtype='object', length=386)
Number of categorical columns: 0
Number of numerical columns: 386
# Count missing entries per column and report only the affected columns.
null_counts = df.isnull().sum()
print("Columns with null values:")
for column in null_counts.index[null_counts > 0]:
    print(f"{column}: {null_counts[column]} null values")
Columns with null values:
# Impute any missing values with each column's mode (first mode on ties).
df_filled = df.fillna(df.mode().iloc[0])

# Verify that the imputation left no missing values behind.
null_counts_filled = df_filled.isnull().sum()
if not null_counts_filled.any():
    print("No null values found.")
else:
    print("Null values:")
    print(null_counts_filled)
No null values found.
From the above output, we can see that the variables named 'value0', 'value1', ..., 'value383' contain the feature values extracted from the CT scan images for each patient. The last variable, 'reference', is our target variable: it contains the relative location of the CT slice along the body axis.
# Summary statistics (count, mean, std, min/max, quartiles) for every column.
df_filled.describe(include='all')
| patientId | value0 | value1 | value2 | value3 | value4 | value5 | value6 | value7 | value8 | ... | value375 | value376 | value377 | value378 | value379 | value380 | value381 | value382 | value383 | reference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | ... | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 |
| mean | 47.075701 | 0.059627 | 0.071558 | 0.145819 | 0.218728 | 0.274762 | 0.276189 | 0.204531 | 0.062281 | -0.042025 | ... | -0.029404 | 0.182913 | 0.320112 | 0.359373 | 0.342889 | 0.266091 | 0.083049 | -0.031146 | -0.154524 | 47.028039 |
| std | 27.414240 | 0.174243 | 0.196921 | 0.300270 | 0.359163 | 0.378862 | 0.369605 | 0.351294 | 0.292232 | 0.268391 | ... | 0.085817 | 0.383333 | 0.463517 | 0.478188 | 0.471811 | 0.437633 | 0.279734 | 0.098738 | 0.122491 | 22.347042 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | -0.250000 | -0.250000 | -0.250000 | ... | -0.250000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | -0.250000 | -0.250000 | 1.738733 |
| 25% | 23.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | 29.891607 |
| 50% | 46.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | 43.987893 |
| 75% | 70.000000 | 0.000000 | 0.000000 | 0.000000 | 0.446429 | 0.684477 | 0.662382 | 0.441412 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.996286 | 0.999677 | 0.999560 | 0.949478 | 0.000000 | 0.000000 | 0.000000 | 63.735059 |
| max | 96.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.998790 | 0.996468 | 0.999334 | 1.000000 | 1.000000 | ... | 0.961279 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.999857 | 0.996839 | 0.942851 | 97.489115 |
8 rows × 386 columns
# List the distinct patient identifiers present in the dataset (sorted).
import numpy as np

np.unique(df_filled["patientId"])
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96])
# Confirm the column layout of the imputed DataFrame.
print(df_filled.columns)
Index(['patientId', 'value0', 'value1', 'value2', 'value3', 'value4', 'value5',
'value6', 'value7', 'value8',
...
'value375', 'value376', 'value377', 'value378', 'value379', 'value380',
'value381', 'value382', 'value383', 'reference'],
dtype='object', length=386)
# Drop the identifier column 'patientId', then separate the feature matrix
# (df_x) from the target variable 'reference' (df_y).
df_copy = df_filled.drop(columns=['patientId'])
df_x = df_copy.drop(columns=['reference'])
df_y = df_copy['reference']
# Plot the distribution of the target variable 'reference'.
# Fix: matplotlib/seaborn were used here before being imported anywhere
# earlier in the file, and seaborn.distplot is deprecated (slated for
# removal in seaborn v0.14) — replaced with histplot, which reproduces the
# density-normalized histogram plus KDE overlay that distplot drew.
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 8))
sns.histplot(df_y, bins=100, kde=True, stat='density')
<ipython-input-61-1c272ca86b41>:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df_y, bins=100)
<Axes: xlabel='reference', ylabel='Density'>
Split the Data
# Split features/target into 70% training and 30% testing partitions
# (fixed random_state for reproducibility).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.3, random_state=42
)

# Preview the first rows of each split.
for label, part in (("X_train:", X_train), ("\nX_test:", X_test),
                    ("\ny_train:", y_train), ("\ny_test:", y_test)):
    print(label)
    print(part.head())
X_train:
value0 value1 value2 value3 value4 value5 value6 \
19113 0.000000 0.000000 0.000000 0.000000 0.939018 0.965932 0.873580
40279 0.114286 0.020930 0.000000 0.963090 0.680756 0.558228 0.439762
42189 0.000000 0.000000 0.000000 0.751979 0.000000 0.842081 0.904479
30994 0.000000 0.000000 0.914286 0.855310 0.893836 0.000000 0.000000
19373 0.000000 0.267206 0.904605 0.972478 0.856772 0.000000 0.000000
value7 value8 value9 ... value374 value375 value376 \
19113 0.000000 0.000000 -0.25 ... 0.0 0.0 0.000000
40279 0.898336 0.741252 0.00 ... 0.0 0.0 0.000000
42189 0.943744 0.699692 0.00 ... 0.0 0.0 0.994347
30994 0.000000 0.000000 -0.25 ... 0.0 0.0 0.000000
19373 0.000000 0.000000 -0.25 ... 0.0 0.0 0.000000
value377 value378 value379 value380 value381 value382 value383
19113 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
40279 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
42189 0.999256 0.999908 0.997149 0.0 0.0 0.0 0.0
30994 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
19373 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
[5 rows x 384 columns]
X_test:
value0 value1 value2 value3 value4 value5 value6 \
46266 0.00000 0.000000 0.000000 0.000000 0.000000 0.704285 0.0
2778 0.86783 0.898305 0.169761 0.411677 0.950726 0.000000 0.0
34408 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0
13871 0.00000 0.000000 0.000000 0.000000 0.797264 0.390216 0.0
35994 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0
value7 value8 value9 ... value374 value375 value376 value377 \
46266 0.00 -0.25 -0.25 ... 0.0 0.0 0.0 0.797007
2778 -0.25 -0.25 -0.25 ... 0.0 0.0 0.0 0.000000
34408 0.00 -0.25 -0.25 ... 0.0 0.0 0.0 0.999131
13871 0.00 0.00 -0.25 ... 0.0 0.0 0.0 0.000000
35994 0.00 -0.25 -0.25 ... 0.0 0.0 0.0 0.000000
value378 value379 value380 value381 value382 value383
46266 0.999385 0.999884 0.999754 0.0 0.00 -0.25
2778 0.988959 0.985753 0.000000 0.0 -0.25 -0.25
34408 0.999918 0.999959 0.999382 0.0 0.00 -0.25
13871 0.000000 0.000000 0.000000 0.0 0.00 -0.25
35994 0.000000 0.000000 0.000000 0.0 0.00 -0.25
[5 rows x 384 columns]
y_train:
19113 91.005317
40279 26.270760
42189 33.807090
30994 79.645955
19373 81.673863
Name: reference, dtype: float64
y_test:
46266 43.349961
2778 12.119149
34408 39.070125
13871 88.928088
35994 60.022506
Name: reference, dtype: float64
# Fit an ordinary least-squares regression model on the training split.
from sklearn.linear_model import LinearRegression

lm = LinearRegression().fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluate the fitted model on the held-out test set.
y_pred = lm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
for label, score in (('Mean Squared Error:', mse),
                     ('Mean Absolute Error:', mae),
                     ('R-squared:', r2)):
    print(label, score)
Mean Squared Error: 68.29434562989341 Mean Absolute Error: 6.123343272084307 R-squared: 0.8624473194580933
# Evaluate the same model on its own training data; comparing these scores
# with the test-set scores above indicates over- or under-fitting.
y_pred = lm.predict(X_train)
for label, score in (
    ('Mean Squared Error:', mean_squared_error(y_train, y_pred)),
    ('Mean Absolute Error:', mean_absolute_error(y_train, y_pred)),
    ('R-squared:', r2_score(y_train, y_pred)),
):
    print(label, score)
Mean Squared Error: 67.82275427597651 Mean Absolute Error: 6.10251339719793 R-squared: 0.8645188670291543
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Define the function to plot learning curves.
def plotLearningCurves(X, y, step):
    """Plot training vs. test MSE as a function of training-set size.

    Fixes two defects in the original implementation:
      * the model was fitted once on the FULL data before the loop, so both
        error curves were constant with respect to the training-set size —
        the model is now re-fitted on each growing subset inside the loop;
      * the loop body sliced the globals X_train/y_train instead of the
        X/y parameters passed to the function.

    NOTE(review): still relies on the globals ``lm`` (the model object) and
    ``X_test``/``y_test`` (the evaluation set), as the original did.
    """
    m, _ = X.shape
    # Largest multiple of 10 not exceeding the number of samples.
    maxVal = (m // 10) * 10
    N_size_arr = np.arange(10, maxVal + 10, step)
    error_arr = np.zeros((len(N_size_arr), 2))

    # Increase the training subset size by `step` in each iteration.
    for index, i in enumerate(N_size_arr):
        X_subset = X[:i]
        y_subset = y[:i]
        # Re-fit on the first i training samples only.
        lm.fit(X_subset, y_subset)
        # Train error on the subset, generalization error on the test set.
        error_arr[index, 0] = mean_squared_error(y_subset, lm.predict(X_subset))
        error_arr[index, 1] = mean_squared_error(y_test, lm.predict(X_test))

    # Restore the model fitted on the full training data, so downstream
    # cells that reuse `lm` see the same state as before this call.
    lm.fit(X, y)

    # Plot "Training set size" vs. "Mean Squared Error" for both curves.
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_yscale('log')
    line1, = ax.plot(N_size_arr, error_arr[:, 0], c='red')
    line2, = ax.plot(N_size_arr, error_arr[:, 1], c='blue')
    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")
    ax.legend((line1, line2), ("Train Error", "Test Error"))
# Call the function to plot the learning curves.
plotLearningCurves(X_train, y_train, 200)

# Predicting reference values with the test dataset.
y_pred = lm.predict(X_test)

# Plotting predictions vs. y_test.
# Fix: the original called ax.plot(y_test, y_pred), which put the test
# targets on the x-axis while labelling that axis "Predictions" — the
# arguments now match the axis labels.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.set_xlabel("Predictions")
ax.set_ylabel("Test Target Variable")
ax.plot(y_pred, y_test, 'bo', ms=1)

# Display the plot.
plt.show()
Based on the above information, the model shows at most very mild overfitting. Overfitting occurs when a model learns the training data too well and performs poorly on unseen data.
Here the mean squared error (67.82 vs. 68.29) and mean absolute error (6.10 vs. 6.12) on the training dataset are only slightly lower than on the test dataset, and the R-squared values are nearly identical (0.865 vs. 0.862) — so the model actually generalizes well.
Model complexity is still a concern: with a large number of features (384) relative to the structure of the data, the model could learn noise or irrelevant patterns. Dimensionality reduction is therefore worth exploring.
Feature Standardization
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize every feature to zero mean / unit variance, then keep just
# enough principal components to explain 75% of the total variance.
scaler = StandardScaler()
scaled_df_x = scaler.fit_transform(df_x)

pca = PCA(0.75)
pca_vectors = pca.fit_transform(scaled_df_x)

# Report the variance ratio captured by each retained component.
for index, var in enumerate(pca.explained_variance_ratio_, start=1):
    print("Explained Variance ratio by Principal Component ", index, " : ", var)
Explained Variance ratio by Principal Component 1 : 0.14855715257210392 Explained Variance ratio by Principal Component 2 : 0.12108312979894122 Explained Variance ratio by Principal Component 3 : 0.06474442177698231 Explained Variance ratio by Principal Component 4 : 0.03774984839961066 Explained Variance ratio by Principal Component 5 : 0.03510333274248252 Explained Variance ratio by Principal Component 6 : 0.025645225528655405 Explained Variance ratio by Principal Component 7 : 0.02330954175071417 Explained Variance ratio by Principal Component 8 : 0.021428275145390387 Explained Variance ratio by Principal Component 9 : 0.017614067903705406 Explained Variance ratio by Principal Component 10 : 0.01589391279292687 Explained Variance ratio by Principal Component 11 : 0.014032726489128131 Explained Variance ratio by Principal Component 12 : 0.012871921022909665 Explained Variance ratio by Principal Component 13 : 0.0121634919283602 Explained Variance ratio by Principal Component 14 : 0.01072092397679328 Explained Variance ratio by Principal Component 15 : 0.009760345345618412 Explained Variance ratio by Principal Component 16 : 0.009482151161428901 Explained Variance ratio by Principal Component 17 : 0.008695231697355841 Explained Variance ratio by Principal Component 18 : 0.008205878792193493 Explained Variance ratio by Principal Component 19 : 0.008041977514589788 Explained Variance ratio by Principal Component 20 : 0.007543652519555233 Explained Variance ratio by Principal Component 21 : 0.006931981984392693 Explained Variance ratio by Principal Component 22 : 0.00646707766389355 Explained Variance ratio by Principal Component 23 : 0.006083596311372858 Explained Variance ratio by Principal Component 24 : 0.005868832669382626 Explained Variance ratio by Principal Component 25 : 0.005741237892731493 Explained Variance ratio by Principal Component 26 : 0.005545363955500639 Explained Variance ratio by Principal Component 27 : 0.0053812466975905315 Explained Variance 
ratio by Principal Component 28 : 0.005160938036779552 Explained Variance ratio by Principal Component 29 : 0.005084092580391446 Explained Variance ratio by Principal Component 30 : 0.004799621223915218 Explained Variance ratio by Principal Component 31 : 0.004652832065005538 Explained Variance ratio by Principal Component 32 : 0.004574687218755086 Explained Variance ratio by Principal Component 33 : 0.004478718115698722 Explained Variance ratio by Principal Component 34 : 0.004312607244892595 Explained Variance ratio by Principal Component 35 : 0.004197561850896129 Explained Variance ratio by Principal Component 36 : 0.004096225788266026 Explained Variance ratio by Principal Component 37 : 0.004050004696179025 Explained Variance ratio by Principal Component 38 : 0.0040203530745072285 Explained Variance ratio by Principal Component 39 : 0.0039032597942033425 Explained Variance ratio by Principal Component 40 : 0.0037259511080387207 Explained Variance ratio by Principal Component 41 : 0.0036811022474497667 Explained Variance ratio by Principal Component 42 : 0.003532807590921863 Explained Variance ratio by Principal Component 43 : 0.0035221449980117033 Explained Variance ratio by Principal Component 44 : 0.0033683692074793284 Explained Variance ratio by Principal Component 45 : 0.003324213440610176 Explained Variance ratio by Principal Component 46 : 0.0032417350414558144 Explained Variance ratio by Principal Component 47 : 0.003097464371346726 Explained Variance ratio by Principal Component 48 : 0.0030493180022711022 Explained Variance ratio by Principal Component 49 : 0.0030174010370553972 Explained Variance ratio by Principal Component 50 : 0.002893053408362604 Explained Variance ratio by Principal Component 51 : 0.002831606959181239 Explained Variance ratio by Principal Component 52 : 0.002775687439238411
By keeping only the components needed to explain 75% of the variance, we are able to reduce the dimensionality from 384 features to 52 principal components.
import numpy as np
import matplotlib.pyplot as plt

# Per-component and cumulative explained variance of the fitted PCA.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)
components = range(1, len(explained_variance_ratio) + 1)

# Plot both series against the component index.
plt.figure(figsize=(10, 6))
plt.plot(components, explained_variance_ratio, marker='o', label='Explained Variance Ratio')
plt.plot(components, cumulative_variance, marker='o', label='Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio / Cumulative Explained Variance')
plt.title('Explained Variance Ratio and Cumulative Explained Variance')
plt.legend()
plt.show()
# Fix: the PCA was already fitted and the projection computed above —
# calling fit_transform again repeated the entire decomposition for no
# benefit (and would silently diverge if `scaled_df_x` were ever changed
# in between). Reuse the existing projection.
print("PCA Vectors:")
print(pca_vectors)
PCA Vectors: [[ 1.66418097e+01 -5.28637854e+00 5.90813596e+00 ... 1.47466546e+00 -6.07350165e-03 7.87119195e-01] [ 1.65943085e+01 -4.94762736e+00 6.12864705e+00 ... 1.18524823e+00 -1.03928883e-01 1.02606483e+00] [ 1.65927230e+01 -4.91301981e+00 6.17570643e+00 ... 1.15494987e+00 -1.00923531e-01 8.95570094e-01] ... [-4.79214559e+00 1.42699675e+01 5.57219567e-01 ... -7.52523437e-01 1.66459189e-01 7.02028650e-02] [ 1.73465338e+01 -3.01729764e+00 6.15848484e+00 ... -5.72477971e-01 -1.88995484e-01 1.73028394e+00] [ 1.75212840e+01 -2.49155501e+00 5.31833961e+00 ... -4.57566760e-01 -3.26204443e-01 1.49666375e+00]]
import seaborn as sns

# Visualize pairwise relationships among a subset of principal components.
n_components = 20  # adjust the number of components as desired
subset_pca = pca_vectors[:, :n_components]

# Wrap the subset in a labelled DataFrame for seaborn.
df_subset_pca = pd.DataFrame(
    subset_pca,
    columns=[f'Principal Component {i+1}' for i in range(subset_pca.shape[1])],
)

# NOTE(review): a 20x20 pairplot renders 400 subplots over 53k rows and can
# be very slow — consider lowering n_components for interactive use.
sns.set(style='ticks')
sns.pairplot(df_subset_pca)
plt.suptitle('Scatter Plot Matrix of Subset Principal Components')
plt.show()